package com.jacken.mars.utils;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import okhttp3.Request;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Breadth-first crawler that downloads the images referenced by crawled HTML pages.
 *
 * <p>For an HTML response, the absolute URL of every {@code <img src>} element is queued as a
 * follow-up task. For a response whose content type starts with {@code image}, the body is
 * written to the download directory under a timestamp-based file name.
 *
 * @author wangqinag
 * @version 1.0
 * @date 2020/12/28 11:29
 */
public class WebCrawler extends BreadthCrawler {
    /** Directory that downloaded images are written to. */
    File downloadDir;

    /** Path of the download directory. */
    private static final String DOWN_PATH = "D:/toolSource/picFile";

    /** URL the crawl starts from. */
    private static final String SEED = "https://www.csdn.net";

    /** Extension used when no usable one can be derived from the content type. */
    private static final String DEFAULT_IMAGE_EXTENSION = "img";

    /**
     * Requester that adds a fixed {@code User-Agent} and {@code Cookie} header to every request.
     */
    public static class MyRequester extends OkHttpRequester {

        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";
        String cookie = "JSESSIONID=asdasdasdasdasdasdasdsadsa";

        /**
         * Builds the request for a crawl task; this method is executed before each request is
         * sent.
         *
         * @param crawlDatum the task the request is built for
         * @return the parent's request builder with the two headers added
         */
        @Override
        public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
            return super.createRequestBuilder(crawlDatum)
                    .addHeader("User-Agent", userAgent)
                    .addHeader("Cookie", cookie);
        }

    }

    /**
     * Creates the crawler, prepares the download directory, installs {@link MyRequester},
     * registers the seed URL and sets the thread count (50) and topN (100).
     *
     * @param crawlPath passed through to the parent constructor
     * @param autoParse passed through to the parent constructor
     */
    public WebCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        downloadDir = new File(DOWN_PATH);
        // mkdirs() also returns false when the directory already exists, so only warn when the
        // path is still not a directory afterwards.
        if (!downloadDir.mkdirs() && !downloadDir.isDirectory()) {
            System.err.println("Could not create download directory " + downloadDir.getAbsolutePath());
        }
        this.setRequester(new MyRequester());
        this.addSeed(SEED);
        setThreads(50);
        getConf().setTopN(100);
    }

    /**
     * Handles one fetched page: follows 301/302 redirects, queues the image URLs found in HTML
     * pages and saves image responses to disk. Responses without a content type, or with any
     * other content type, are ignored.
     *
     * @param page the fetched page
     * @param next collector for follow-up tasks
     * @throws RuntimeException if an image cannot be written to disk
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        if (page.code() == 301 || page.code() == 302) {
            // Re-queue the redirect target, carrying over the metadata of the current task.
            next.addAndReturn(page.location()).meta(page.meta());
            return;
        }

        String url = page.url();
        System.out.println("url:" + url);
        String contentType = page.contentType();
        System.out.println("contentType:" + contentType);
        if (contentType == null) {
            return;
        }

        if (contentType.contains("html")) {
            queueImageUrls(page, next);
        } else if (contentType.startsWith("image")) {
            saveImage(page, contentType);
        }
    }

    /** Queues the absolute URL of every {@code <img src>} element of an HTML page. */
    private void queueImageUrls(Page page, CrawlDatums next) {
        Elements imgs = page.select("img[src]");
        for (Element img : imgs) {
            String imgSrc = img.attr("abs:src");
            System.out.println("imgSrc:" + imgSrc);
            // "abs:src" yields an empty string when the URL cannot be made absolute;
            // such an entry is not a fetchable task.
            if (imgSrc.isEmpty()) {
                continue;
            }
            next.add(imgSrc);
        }
    }

    /** Writes the body of an image response to a new file in the download directory. */
    private void saveImage(Page page, String contentType) {
        String extensionName = imageExtension(contentType);
        // The generated name is only timestamp + 3 random digits, so concurrent crawler
        // threads can collide. Best-effort (not atomic): pick another name while taken.
        File imageFile;
        do {
            imageFile = new File(downloadDir, getTimeCodeName() + "." + extensionName);
        } while (imageFile.exists());

        try {
            FileUtils.write(imageFile, page.content());
            System.out.println("保存图片 " + page.url() + " 到 " + imageFile.getAbsolutePath());
        } catch (IOException ex) {
            throw new RuntimeException(
                    "Failed to save image " + page.url() + " to " + imageFile.getAbsolutePath(), ex);
        }
    }

    /**
     * Derives a file extension from an image content type, e.g. {@code image/jpeg} gives
     * {@code jpeg}, {@code image/png; charset=UTF-8} gives {@code png} and
     * {@code image/svg+xml} gives {@code svg}.
     *
     * @param contentType the content type header value, not {@code null}
     * @return the subtype, or {@link #DEFAULT_IMAGE_EXTENSION} when the value has no subtype
     *         or the subtype contains characters other than letters, digits, '-', '_' and '.'
     */
    private static String imageExtension(String contentType) {
        // Drop parameters such as "; charset=UTF-8".
        int paramStart = contentType.indexOf(';');
        String mediaType = paramStart >= 0 ? contentType.substring(0, paramStart) : contentType;
        mediaType = mediaType.trim();

        int slash = mediaType.indexOf('/');
        if (slash < 0) {
            return DEFAULT_IMAGE_EXTENSION;
        }
        String subtype = mediaType.substring(slash + 1);

        // Drop a structured-syntax suffix such as "+xml".
        int suffixStart = subtype.indexOf('+');
        if (suffixStart >= 0) {
            subtype = subtype.substring(0, suffixStart);
        }
        if (subtype.isEmpty()) {
            return DEFAULT_IMAGE_EXTENSION;
        }

        // Only accept characters that are safe inside a file name.
        for (int i = 0; i < subtype.length(); i++) {
            char c = subtype.charAt(i);
            boolean safe = Character.isLetterOrDigit(c) || c == '-' || c == '_' || c == '.';
            if (!safe) {
                return DEFAULT_IMAGE_EXTENSION;
            }
        }
        return subtype;
    }

    /**
     * Creates a file name from the current system time.
     *
     * @return the current time formatted as {@code yyMMddHHmmssSSS} followed by a random
     *         three-digit number between 100 and 999
     */
    public static String getTimeCodeName() {
        // A new formatter per call: SimpleDateFormat must not be shared between threads.
        DateFormat format = new SimpleDateFormat("yyMMddHHmmssSSS");
        String timestamp = format.format(new Date());
        int code = (int) ((Math.random() * 9 + 1) * 100);
        return timestamp + code;
    }

}
